#!/bin/sh

# --------------------------------------------------------------------------
# This script was adapted from ubconfc, a script to generate
# a C implementation of unicode property tables for GHC.
# Ubconfc was written by Dimitry Golubovsky (dimitry@golubovsky.org)
# as part of the Partial Unicode Support patch
#
# Adopted for use with GHC.
# License: see http://www.haskell.org/ghc/license
#
# Converted to JavaScript as part of unicode support in Haste.
# (c) Laszlo Pandy (laszlok2@gmail.com) and freely redistributable.
#
# -------------------------------------------------------------------------

# The script reads the unicode table from the standard input,
# and outputs JavaScript code to the standard output.
# The JavaScript code contains the character property tables
# and basic functions to access those properties.
# Example usage:
#    $ curl -o UnicodeData.txt http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt
#    $ sh unicode-gen.sh < UnicodeData.txt > unicode.js

#   Output the file header

echo "/*-------------------------------------------------------------------------"
echo "This is an automatically generated file: do not edit"
echo "Generated by `basename $0` at `date`"
echo "-------------------------------------------------------------------------*/"

#   Define structures

cat <<EOF

/* Unicode general categories, listed in the same order as in the Unicode
 * standard -- this must be the same order as in GHC.Unicode.
 */

var NUMCAT_LU = 0;  /* Letter, Uppercase */
var NUMCAT_LL = 1;  /* Letter, Lowercase */
var NUMCAT_LT = 2;  /* Letter, Titlecase */
var NUMCAT_LM = 3;  /* Letter, Modifier */
var NUMCAT_LO = 4;  /* Letter, Other */
var NUMCAT_MN = 5;  /* Mark, Non-Spacing */
var NUMCAT_MC = 6;  /* Mark, Spacing Combining */
var NUMCAT_ME = 7;  /* Mark, Enclosing */
var NUMCAT_ND = 8;  /* Number, Decimal */
var NUMCAT_NL = 9;  /* Number, Letter */
var NUMCAT_NO = 10;  /* Number, Other */
var NUMCAT_PC = 11;  /* Punctuation, Connector */
var NUMCAT_PD = 12;  /* Punctuation, Dash */
var NUMCAT_PS = 13;  /* Punctuation, Open */
var NUMCAT_PE = 14;  /* Punctuation, Close */
var NUMCAT_PI = 15;  /* Punctuation, Initial quote */
var NUMCAT_PF = 16;  /* Punctuation, Final quote */
var NUMCAT_PO = 17;  /* Punctuation, Other */
var NUMCAT_SM = 18;  /* Symbol, Math */
var NUMCAT_SC = 19;  /* Symbol, Currency */
var NUMCAT_SK = 20;  /* Symbol, Modifier */
var NUMCAT_SO = 21;  /* Symbol, Other */
var NUMCAT_ZS = 22;  /* Separator, Space */
var NUMCAT_ZL = 23;  /* Separator, Line */
var NUMCAT_ZP = 24;  /* Separator, Paragraph */
var NUMCAT_CC = 25;  /* Other, Control */
var NUMCAT_CF = 26;  /* Other, Format */
var NUMCAT_CS = 27;  /* Other, Surrogate */
var NUMCAT_CO = 28;  /* Other, Private Use */
var NUMCAT_CN = 29;  /* Other, Not Assigned */


/* struct _convrule_ */
function _convrule_(category, catnumber, possible, updist, lowdist, titledist) {
	return {
		category: category, /* unsigned int */
		catnumber: catnumber, /* unsigned int */
		possible: possible, /* int */
		updist: updist, /* int */
		lowdist: lowdist, /* int */
		titledist: titledist /* int */
	};
};

/* struct _charblock_ */
function _charblock_(start, length, rule) {
	return {
		start: start, /* int */
		length: length, /* int */
		rule: rule /* struct _convrule_ */
	};
}

EOF

#   Convert the stdin file to the C table

awk '
BEGIN {
	FS=";"
	catidx=0
	rulidx=0
	blockidx=0
	cblckidx=0
	sblckidx=0
	blockb=-1
	blockl=0
	digs="0123456789ABCDEF"
	for(i=0;i<16;i++)
	{
		hex[substr(digs,i+1,1)]=i;
	}
}
function em1(a)
{
	if(a=="") return "-1"
	return "0x"a
}
function h2d(a)
{
	l=length(a)
	acc=0
	for(i=1;i<=l;i++)
	{
		acc=acc*16+hex[substr(a,i,1)];
	}
	return acc
}
function dumpblock()
{
	blkd=blockb ", " blockl ", rule" rules[blockr]
	blocks[blockidx]=blkd
	blockidx++
	if(blockb<=256) lat1idx++
	split(blockr,rsp,",")
	if(substr(rsp[3],2,1)=="1")
	{
		cblcks[cblckidx]=blkd
		cblckidx++
	}
	if(rsp[1]=="GENCAT_ZS")
	{
		sblcks[sblckidx]=blkd
		sblckidx++
	}
	blockb=self
	blockl=1
	blockr=rule
}
{
	name=$2
	cat=toupper($3)
	self=h2d($1)
	up=h2d($13)
	low=h2d($14)
	title=h2d($15)
	convpos=1
	if((up==0)&&(low==0)&&(title==0)) convpos=0
	if(up==0) up=self
	if(low==0) low=self
	if(title==0) title=self
	updist=up-self
	lowdist=low-self
	titledist=title-self
	rule="GENCAT_"cat", NUMCAT_"cat", "((convpos==1)?                   \
				("1, " updist ", " lowdist ", " titledist): \
				("0, 0, 0, 0"))
	if(cats[cat]=="")
	{
		cats[cat]=(2^catidx);
		catidx++;
	}
	if(rules[rule]=="")
	{
		rules[rule]=rulidx;
		rulidx++;
	}
	if(blockb==-1)
	{
		blockb=self
		blockl=1
		blockr=rule
	}
	else
	{
		if (index(name,"First>")!=0)
		{
			dumpblock()
		}
		else if (index(name,"Last>")!=0)
		{
			blockl+=(self-blockb)
		}
		else if((self==blockb+blockl)&&(rule==blockr)) blockl++
		else
		{
			dumpblock()
		}
	}
}
END {
	dumpblock()
	for(c in cats) print "var GENCAT_"c" = "cats[c]";"
	print "var MAX_UNI_CHAR = " self ";"
	print "var NUM_BLOCKS = " blockidx ";"
	print "var NUM_CONVBLOCKS = " cblckidx ";"
	print "var NUM_SPACEBLOCKS = " sblckidx ";"
	print "var NUM_LAT1BLOCKS = " lat1idx ";"
	print "var NUM_RULES = " rulidx ";"
	for(r in rules)
	{
		printf "var rule" rules[r] "=_convrule_(" r ");\n"
	}
	print "var allchars = ["
	for(i=0;i<blockidx;i++)
	{
		printf "\t_charblock_(" blocks[i] ")"
		print (i<(blockidx-1))?",":"" 
	}
	print "];"
	print "var convchars = ["
	for(i=0;i<cblckidx;i++)
	{
		printf "\t_charblock_(" cblcks[i] ")"
		print (i<(cblckidx-1))?",":""
	}
	print "];"
	print "var spacechars = ["
	for(i=0;i<sblckidx;i++)
	{
		printf "\t_charblock_(" sblcks[i] ")"
		print (i<(sblckidx-1))?",":""
	}
	print "];"
}
'
#   Output the C procedures code

cat <<EOF

/*
	Obtain the reference to character rule by doing
	binary search over the specified array of blocks.
	To make checkattr shorter, the address of
	nullrule is returned if the search fails:
	this rule defines no category and no conversion
	distances. The compare function returns 0 when
	key->start is within the block. Otherwise
	result of comparison of key->start and start of the
	current block is returned as usual.
*/

var nullrule = _convrule_(0,NUMCAT_CN,0,0,0,0);

function blkcmp(/* int */ key_char, /* _charblock_ */ cur) {
	if ((key_char >= cur.start) && (key_char < (cur.start + cur.length))) {
		return 0;
	}
	if (key_char > cur.start) {
		return 1;
	}
	return -1;
}

function bsearch (array, length, key, compareFunc) {
	var lower = 0;
	var upper = length;

	while (lower < upper) {
		var idx = Math.floor((lower + upper) / 2);
		var comparison = compareFunc(key, array[idx]);
		if (comparison < 0) {
			upper = idx;
		}
		else if (comparison > 0) {
			lower = idx + 1;
		}
		else {
			return array[idx];
		}
	}

  return null;
}

function getrule(
	/* const struct _charblock_ */ blocks,
	/* int */ numblocks,
	/* int */ unichar)
{
	var char_block = bsearch(blocks, numblocks, unichar, blkcmp);
	if (char_block === null) {
		return nullrule;
	}
	return char_block.rule;
}
	


/*
	Check whether a character (internal code) has certain attributes.
	Attributes (category flags) may be ORed. The function ANDs
	character category flags and the mask and returns the result.
	If the character belongs to one of the categories requested,
	the result will be nonzero.
*/

function checkattr(/* int */ c, /* unsigned int */ catmask) {
	var numblocks = (c < 256) ? NUM_LAT1BLOCKS : NUM_BLOCKS;
	var rule = getrule(allchars, numblocks, c);
	return (catmask & rule.category);
}

function checkattr_s(/* int */ c, /* unsigned int */ catmask)
{
	var rule = getrule(spacechars, NUM_SPACEBLOCKS, c)
	return (catmask & rule.category);
}

/*
	Define predicate functions for some combinations of categories.
*/


function unipred(m) {
	return function(c) {
		return checkattr(c, m);
	};
};

function unipred_s(m) {
	return function(c) {
		return checkattr_s(c, m);
	};
};

/*
	Make these rules as close to Hugs as possible.
*/

var u_iswcntrl = unipred(GENCAT_CC);
var u_iswprint = unipred(
  GENCAT_MC | GENCAT_NO | GENCAT_SK | GENCAT_ME | GENCAT_ND |
  GENCAT_PO | GENCAT_LT | GENCAT_PC | GENCAT_SM | GENCAT_ZS |
  GENCAT_LU | GENCAT_PD | GENCAT_SO | GENCAT_PE | GENCAT_PF |
  GENCAT_PS | GENCAT_SC | GENCAT_LL | GENCAT_LM | GENCAT_PI |
  GENCAT_NL | GENCAT_MN | GENCAT_LO);

var u_iswspace = unipred_s(GENCAT_ZS);
var u_iswupper = unipred(GENCAT_LU | GENCAT_LT);
var u_iswlower = unipred(GENCAT_LL);
var u_iswalpha = unipred(GENCAT_LL | GENCAT_LU | GENCAT_LT | GENCAT_LM | GENCAT_LO);
var u_iswdigit = unipred(GENCAT_ND);
var u_iswalnum = unipred(
	GENCAT_LT | GENCAT_LU | GENCAT_LL | GENCAT_LM | GENCAT_LO |
	GENCAT_MC | GENCAT_ME | GENCAT_MN | GENCAT_NO | GENCAT_ND | GENCAT_NL);

function caseconv(getDist) {
	return function(c) {
		var /*_convrule_*/ rule = getrule(convchars, NUM_CONVBLOCKS, c);
		if (rule == nullrule) {
			return c;
		}
		return c + getDist(rule);
	}
};

var u_towupper = caseconv(function(x) { return x.updist; });
var u_towlower = caseconv(function(x) { return x.lowdist; });
var u_towtitle = caseconv(function(x) { return x.titledist; });

function u_gencat(c) {
	return getrule(allchars, NUM_BLOCKS, c).catnumber;
}

EOF
